Test Survey with ML¶

In [1]:
!pip install matplotlib
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

pd.set_option('display.max_colwidth', None)  # or 199
np.set_printoptions(threshold=np.inf)
pd.set_option('display.max_columns', None)

df = pd.read_csv("New Headphone Production Survey for ML.csv")
df
Requirement already satisfied: matplotlib in /srv/conda/envs/notebook/lib/python3.10/site-packages (3.9.2)
Requirement already satisfied: contourpy>=1.0.1 in /srv/conda/envs/notebook/lib/python3.10/site-packages (from matplotlib) (1.3.0)
Requirement already satisfied: cycler>=0.10 in /srv/conda/envs/notebook/lib/python3.10/site-packages (from matplotlib) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /srv/conda/envs/notebook/lib/python3.10/site-packages (from matplotlib) (4.54.1)
Requirement already satisfied: kiwisolver>=1.3.1 in /srv/conda/envs/notebook/lib/python3.10/site-packages (from matplotlib) (1.4.7)
Requirement already satisfied: numpy>=1.23 in /srv/conda/envs/notebook/lib/python3.10/site-packages (from matplotlib) (1.26.4)
Requirement already satisfied: packaging>=20.0 in /srv/conda/envs/notebook/lib/python3.10/site-packages (from matplotlib) (24.1)
Requirement already satisfied: pillow>=8 in /srv/conda/envs/notebook/lib/python3.10/site-packages (from matplotlib) (10.4.0)
Requirement already satisfied: pyparsing>=2.3.1 in /srv/conda/envs/notebook/lib/python3.10/site-packages (from matplotlib) (3.1.4)
Requirement already satisfied: python-dateutil>=2.7 in /srv/conda/envs/notebook/lib/python3.10/site-packages (from matplotlib) (2.9.0)
Requirement already satisfied: six>=1.5 in /srv/conda/envs/notebook/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)
Out[1]:
Gender Age Status Education Occupation NumberHps ActivityHps TimeHps PriceHps PlaceHps FactorHps HealthHps InnovationHps InfoHps_online InfoHps_Social InfoHps_Google InfoHps_Store InfoHps_PR InfoHps_Ads PB_nosound PB_disconnect PB_badsound PB_unfit PB_oneear PB_toosd PB_audiocut PB_battery PB_unplug HpAnswer
0 0 2 0 1 2 0 0 1 1 1 2 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 2
1 0 2 1 1 3 0 4 0 2 0 2 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 2
2 1 3 1 2 2 0 5 0 2 0 0 0 2 1 1 0 0 0 0 0 1 1 0 0 0 1 0 0 2
3 0 2 0 1 2 1 2 0 2 0 2 0 0 0 0 1 0 1 0 1 1 0 0 1 0 0 0 0 2
4 1 2 0 1 2 1 0 0 4 0 0 1 1 1 1 0 1 0 0 0 0 1 0 0 0 0 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
67 1 1 0 2 2 1 0 0 4 0 0 1 3 1 1 1 0 1 0 0 1 0 0 0 0 0 0 0 3
68 0 1 0 2 2 0 2 0 4 1 2 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 3
69 1 2 1 2 2 0 0 0 2 1 2 1 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 3
70 0 1 0 2 3 1 0 0 4 1 3 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 3
71 0 1 0 2 4 0 0 0 2 1 1 0 3 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 3

72 rows × 29 columns

In [2]:
# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


# df3 = pd.concat([df]*20, ignore_index=True)

# train_df,test_df = train_test_split(df3, test_size=0.3, random_state=42, shuffle=True)

# X_train = train_df.drop("HpAnswer", axis=1)
# Y_train = train_df["HpAnswer"]
# X_test  = test_df.drop("HpAnswer", axis=1)
# Y_test  = test_df["HpAnswer"]
In [3]:
#importanct features

df3 = pd.concat([df]*20, ignore_index=True)

train_df,test_df = train_test_split(df3, test_size=0.3, random_state=42, shuffle=True)

X_train = train_df.drop(labels=['Gender','Age','Status','Education','Occupation','HpAnswer'],axis=1)
Y_train = train_df["HpAnswer"]
X_test  = test_df.drop(labels=['Gender','Age','Status','Education','Occupation','HpAnswer'],axis=1)
Y_test  = test_df["HpAnswer"]

# X_train = train_df.drop("HpAnswer", axis=1)
# Y_train = train_df["HpAnswer"]
# X_test  = test_df.drop("HpAnswer", axis=1)
# Y_test  = test_df["HpAnswer"]

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_prediction = random_forest.predict(X_test)
Y3           = random_forest.predict_proba(X_test)

random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
#print(classification_report(Y_test, Y_prediction))
#print(acc_random_forest)
#print(Y_prediction,Y3)

importances = pd.DataFrame({'feature':X_train.columns,'importance':np.round(random_forest.feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False).set_index('feature')
#importances_cut = importances.loc[importances['feature']!=['Gender','Age','Status','Education','Occupation']]
#importances_qicks = importances.drop(labels=['Gender','Age','Status','Education','Occupation'],axis=0)
#importances_qicks = importances_qicks.sort_values(by=['importance'])
importances
Out[3]:
importance
feature
FactorHps 0.131
ActivityHps 0.092
PriceHps 0.091
InnovationHps 0.083
PlaceHps 0.066
InfoHps_PR 0.048
InfoHps_Google 0.042
InfoHps_Store 0.039
InfoHps_online 0.039
PB_badsound 0.039
PB_disconnect 0.038
HealthHps 0.038
PB_battery 0.034
NumberHps 0.034
TimeHps 0.031
InfoHps_Social 0.030
PB_nosound 0.026
PB_audiocut 0.026
PB_oneear 0.022
PB_toosd 0.022
InfoHps_Ads 0.016
PB_unplug 0.014
PB_unfit 0.000
In [4]:
# X_train = train_df[['Gender','Age','Status','Education','Occupation']]
# Y_train = train_df["HpAnswer"]
# X_test = test_df[['Gender','Age','Status','Education','Occupation']]
# Y_test  = test_df["HpAnswer"]
#      colname=['Gender','Age','Status','Education','Occupation','HpAnswer','NumberHps', 'ActivityHps', 'TimeHps','PriceHps','PlaceHps','FactorHps','HealthHps','InnovationHps',\
#              "PB_nosound","PB_disconnect","PB_badsound","PB_unfit","PB_oneear","PB_toosd","PB_audiocut","PB_battery",\
#              "PB_unplug","InfoHps_online","InfoHps_Social","InfoHps_Google","InfoHps_Store","InfoHps_PR","InfoHps_Ads"]
colname=['ActivityHps','PriceHps','PlaceHps','FactorHps','InnovationHps',\
    "InfoHps_online","InfoHps_Store","InfoHps_PR"]
X_train = train_df[colname]
Y_train = train_df["HpAnswer"]
X_test  = test_df[colname]
Y_test  = test_df["HpAnswer"]

# X_train = train_df[['FactorHps','PriceHps','ActivityHps','InnovationHps','PlaceHps',"InfoHps_PR","InfoHps_online","InfoHps_Store"]]
# Y_train = train_df["HpAnswer"]
# X_test = test_df[['FactorHps','PriceHps','ActivityHps','InnovationHps','PlaceHps',"InfoHps_PR","InfoHps_online","InfoHps_Store"]]
# Y_test  = test_df["HpAnswer"]

# X_train = train_df.drop(['Gender','Age','Status','Education','Occupation','HpAnswer'], axis=1)
# Y_train = train_df["HpAnswer"]
# X_test  = test_df.drop(['Gender','Age','Status','Education','Occupation','HpAnswer'], axis=1)
# Y_test  = test_df["HpAnswer"]

# X_train = train_df.drop("HpAnswer", axis=1)
# Y_train = train_df["HpAnswer"]
# X_test  = test_df.drop("HpAnswer", axis=1)
# Y_test  = test_df["HpAnswer"]
In [5]:
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_prediction = random_forest.predict(X_test)
Y3           = random_forest.predict_proba(X_test)

random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
In [6]:
sgd = linear_model.SGDClassifier(max_iter=5, tol=None)
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)

sgd.score(X_train, Y_train)

acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
In [7]:
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)

Y_pred = logreg.predict(X_test)
Y3     = logreg.predict_proba(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
#print(classification_report(Y_test, Y_pred))
#print(acc_log)
#print(Y_pred,Y3)
In [8]:
# KNN 
knn = KNeighborsClassifier(n_neighbors = 3) 
knn.fit(X_train, Y_train)  
Y_pred = knn.predict(X_test)  
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
In [9]:
gaussian = GaussianNB() 
gaussian.fit(X_train, Y_train)  
Y_pred = gaussian.predict(X_test)  
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
In [10]:
perceptron = Perceptron(max_iter=5)
perceptron.fit(X_train, Y_train)

Y_pred = perceptron.predict(X_test)

acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
/srv/conda/envs/notebook/lib/python3.10/site-packages/sklearn/linear_model/_stochastic_gradient.py:702: ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.
  warnings.warn(
In [11]:
linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)

Y_pred = linear_svc.predict(X_test)

acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
/srv/conda/envs/notebook/lib/python3.10/site-packages/sklearn/svm/_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
In [12]:
decision_tree = DecisionTreeClassifier() 
decision_tree.fit(X_train, Y_train)  
Y_pred = decision_tree.predict(X_test)  
Y3           = decision_tree.predict_proba(X_test)

acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
#print(classification_report(Y_test, Y_pred))
#print(acc_log)
#print(Y_pred,Y3)
In [13]:
results = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent', 
              'Decision Tree'],
    'Score': [acc_linear_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_decision_tree]})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df.head(9)
Out[13]:
Model
Score
98.61 KNN
98.61 Random Forest
98.61 Decision Tree
71.83 Naive Bayes
65.08 Logistic Regression
64.29 Support Vector Machines
64.09 Stochastic Gradient Decent
45.34 Perceptron
In [14]:
importances.plot.bar()
Out[14]:
<Axes: xlabel='feature'>
No description has been provided for this image
In [15]:
from dash import dcc

def feature_importance():
    return importances
colnames=['FactorHps','TimeHps','PriceHps','HealthHps']
xts = pd.DataFrame(np.array([[1,1,1,1]]),columns=colnames)
#xts = np.unique(['NumberHps', 'NumberHps', 'TimeHps'])
xts
Out[15]:
FactorHps TimeHps PriceHps HealthHps
0 1 1 1 1
In [16]:
def predict_input(que1,que2,que3,que4,ans1,ans2,ans3,ans4):  
    X1=({que1: [ans1],
        que2: [ans2],
        que3: [ans3],
        que4: [ans4]})

    #X1 = ({'Gender':[0], 'Age': [2], 'Status':[0], 'Education':[1],'Occupation': [2]})
    
    dfs = pd.DataFrame(X1)
    
    return dfs
In [17]:
def MLdecision_tree(xtest):
    X_train = train_df[xtest.columns.values]
    Y_train = train_df["HpAnswer"]
    random_forest = RandomForestClassifier(n_estimators=100)
    
    X_test  = test_df[xtest.columns.values]
    Y_test  = test_df["HpAnswer"]
    random_forest.fit(X_train, Y_train)
    Y_pred = random_forest.predict(X_test)
    Y3           = random_forest.predict_proba(X_test)
    
    Y_train = train_df["HpAnswer"]
    random_forest.fit(X_train, Y_train)  
    Y_pred_hp = random_forest.predict(xtest)  
    Y3           = random_forest.predict_proba(xtest)
    acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
    print("acc_random_forest_hp:",acc_random_forest)
    
    Y_train = train_df["Gender"]
    random_forest.fit(X_train, Y_train)  
    Y_pred_gender = random_forest.predict(xtest)  
    acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
    print("acc_random_forest_gender:",acc_random_forest)
    
    Y_train = train_df["Age"]
    random_forest.fit(X_train, Y_train)  
    Y_pred_age = random_forest.predict(xtest)  
    acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
    print("acc_random_forest_age:",acc_random_forest)
    
    Y_train = train_df["Status"]
    random_forest.fit(X_train, Y_train)  
    Y_pred_status = random_forest.predict(xtest)
    acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
    print("acc_random_forest_status:",acc_random_forest)
    
    Y_train = train_df["Education"]
    random_forest.fit(X_train, Y_train)  
    Y_pred_edu = random_forest.predict(xtest)  
    acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
    print("acc_random_forest_edu:",acc_random_forest)
    
    Y_train = train_df["Occupation"]
    random_forest.fit(X_train, Y_train)  
    Y_pred_occ = random_forest.predict(xtest)
    acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
    print("acc_random_forest_occ:",acc_random_forest)

    if Y_pred_hp[0] == 1 : charac_hp = "HP1"
    if Y_pred_hp[0] == 2 : charac_hp = "HP2"
    if Y_pred_hp[0] == 3 : charac_hp = "HP3"

    if Y_pred_gender[0] == 0 : charac_gender = "female"
    if Y_pred_gender[0] == 1 : charac_gender = "male"

    if Y_pred_age[0] == 1 : charac_age = "20-27"
    if Y_pred_age[0] == 2 : charac_age = "28-35"
    if Y_pred_age[0] == 3 : charac_age = "36-45"

    if Y_pred_status[0] == 0 : charac_status = "single"
    if Y_pred_status[0] == 1 : charac_status = "married"

    if Y_pred_edu[0] == 1 : charac_edu = "graduate"
    if Y_pred_edu[0] == 2 : charac_edu = "undergraduate"
    if Y_pred_edu[0] == 3 : charac_edu = "high school"
    if Y_pred_edu[0] == 4 : charac_edu = "unspecified education"

    if Y_pred_occ[0] == 1 : charac_occ = "student"
    if Y_pred_occ[0] == 2 : charac_occ = "employee"
    if Y_pred_occ[0] == 3 : charac_occ = "business owner"
    if Y_pred_occ[0] == 4 : charac_occ = "unspecified occupation"

    #acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
    print(classification_report(Y_test, Y_pred))
    #print("acc_decision_tree:",label,acc_decision_tree)
    #print(Y_pred,Y3)
    return charac_hp,charac_gender,charac_age,charac_status,charac_edu,charac_occ,Y_pred_hp[0],Y_pred_gender[0],\
          Y_pred_age[0],Y_pred_status[0],Y_pred_occ[0],Y_pred_edu[0],Y3
In [18]:
MLdecision_tree(xts)
acc_random_forest_hp: 86.9
acc_random_forest_gender: 86.61
acc_random_forest_age: 83.23
acc_random_forest_status: 88.1
acc_random_forest_edu: 85.62
acc_random_forest_occ: 84.03
              precision    recall  f1-score   support

           1       1.00      0.76      0.86        58
           2       0.80      0.96      0.87       245
           3       0.89      0.66      0.76       129

    accuracy                           0.84       432
   macro avg       0.90      0.79      0.83       432
weighted avg       0.86      0.84      0.84       432

Out[18]:
('HP2',
 'female',
 '36-45',
 'married',
 'high school',
 'unspecified occupation',
 2,
 0,
 3,
 1,
 4,
 3,
 array([[0., 1., 0.]]))
In [19]:
dfs = predict_input('FactorHps','ActivityHps','PriceHps','InnovationHps',2,1,1,2)
dfs.columns.values
Out[19]:
array(['FactorHps', 'ActivityHps', 'PriceHps', 'InnovationHps'],
      dtype=object)
In [20]:
# X_train = train_df.drop("HpAnswer", axis=1)
# Y_train = train_df[["HpAnswer"]]
# X_test  = test_df.drop("HpAnswer", axis=1)
# Y_test  = test_df[["HpAnswer"]]
xt = pd.DataFrame([{'FactorHps':1,'PriceHps':2,'ActivityHps':3}])
xt.columns.values
Out[20]:
array(['FactorHps', 'PriceHps', 'ActivityHps'], dtype=object)
In [21]:
from sklearn.preprocessing import LabelBinarizer

label_binarizer = LabelBinarizer().fit(Y_train)
y_onehot_test = label_binarizer.transform(Y_test)
y_onehot_test.shape  # (n_samples, n_classes)

#label_binarizer.transform([1])
Out[21]:
(432, 3)
In [22]:
# class_of_interest = 1
# class_id = np.flatnonzero(label_binarizer.classes_ == class_of_interest)[0]
# class_id
In [23]:
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.datasets import make_classification

from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer
# df3 = pd.concat([df]*20, ignore_index=True)

# train_df,test_df = train_test_split(df3, test_size=0.3, random_state=42, shuffle=True)
# features = ['FactorHps','PriceHps','ActivityHps']
# X_train = train_df[features]
# Y_train = train_df["HpAnswer"]
# X_test  = test_df[features]
# Y_test  = test_df["HpAnswer"]

# decision_tree = DecisionTreeClassifier()
# decision_tree.fit(X_train, Y_train)
# y_score = decision_tree.fit(X_train, Y_train).predict_proba(X_test)

def getROCfigure(htype,features,label,tname):
    df3 = pd.concat([df]*20, ignore_index=True)

    train_df,test_df = train_test_split(df3, test_size=0.3, random_state=42, shuffle=True)

    X_train = train_df[features]
    Y_train = train_df[label]
    X_test  = test_df[features]
    Y_test  = test_df[label]
    decision_tree = DecisionTreeClassifier()
    decision_tree.fit(X_train, Y_train)
    if(label == "HpAnswer") or (label == "Age") or (label == "Education") or (label == "Occupation"):

        label_binarizer = LabelBinarizer().fit(Y_train)
        y_onehot_test = label_binarizer.transform(Y_test)
        y_onehot_test.shape  # (n_samples, n_classes)

        class_of_interest = int(htype)
        y_score = decision_tree.fit(X_train, Y_train).predict_proba(X_test)
        class_id = np.flatnonzero(label_binarizer.classes_ == class_of_interest)[0]

        y_score = decision_tree.fit(X_train, Y_train).predict_proba(X_test)
        fpr, tpr, thresholds = roc_curve(y_onehot_test[:, class_id], y_score[:, class_id])
        fig = px.area(
            x=fpr, y=tpr,
            title=f'<b>ROC Curve (AUC={auc(fpr, tpr):.4f}){tname}</b>',
            labels=dict(x='False Positive Rate', y='True Positive Rate'),
            width=700, height=500
         )    
  
    if (label == "Gender"):
        y_score = decision_tree.fit(X_train, Y_train).predict_proba(X_test)[:,1]
        fpr, tpr, thresholds = roc_curve(np.array(Y_test), y_score)
        fig = px.area(
            x=fpr, y=tpr,
            title=f'<b>ROC Curve (AUC={auc(fpr, tpr):.4f}) of Female 0 - Male 1</b>',
            labels=dict(x='False Positive Rate', y='True Positive Rate'),
            width=700, height=500
         )       
    if (label == "Status"):
        y_score = decision_tree.fit(X_train, Y_train).predict_proba(X_test)[:,1]
        fpr, tpr, thresholds = roc_curve(np.array(Y_test), y_score)
        fig = px.area(
            x=fpr, y=tpr,
            title=f'<b>ROC Curve(AUC={auc(fpr, tpr):.3f})Single 0-Married 1</b>',
            labels=dict(x='False Positive Rate', y='True Positive Rate'),
            width=700, height=500
         )  
    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=0, y1=1
    )
    fig.update_layout(width=270,height=270, font=dict(size=7),margin=dict(l=10, r=10, t=30, b=10))
    fig.update_yaxes(scaleanchor="x", scaleratio=1)
    fig.update_xaxes(constrain='domain')
    return fig
In [24]:
getROCfigure(1,['FactorHps','PriceHps','ActivityHps'],'Status',0)
In [25]:
colname11=['Gender','Age','Status','Education','Occupation','HpAnswer','NumberHps', 'ActivityHps', 'TimeHps','PriceHps','PlaceHps','FactorHps','HealthHps','InnovationHps',\
     "PB_nosound","PB_disconnect","PB_badsound","PB_unfit","PB_oneear","PB_toosd","PB_audiocut","PB_battery",\
     "PB_unplug","InfoHps_online","InfoHps_Social","InfoHps_Google","InfoHps_Store","InfoHps_PR","InfoHps_Ads"]
getROCfigure(1,colname11,'Gender','HP1')
In [26]:
getROCfigure(2,['InnovationHps'],'HpAnswer','HP2')
In [27]:
getROCfigure(3,xt.columns.values,'HpAnswer','HP3')
In [28]:
# from sklearn.preprocessing import MaxAbsScaler
# scaler = MaxAbsScaler()
# df3 = pd.DataFrame(scaler.fit_transform(df),
#                    columns=df.columns.values.tolist())
# df3['HpAnswer'] = df[['HpAnswer']].copy()
# df = df3
# df.head(10)
In [29]:
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix,precision_score,recall_score,accuracy_score
from sklearn.metrics import classification_report

#     X_train = train_df.drop(labels=[label],axis=1)
#     Y_train = train_df[label]
df3 = pd.concat([df]*20, ignore_index=True)
train_df,test_df = train_test_split(df3, test_size=0.3, random_state=42, shuffle=True)

def getROCfigure2(features,label):
    X_train = train_df[features]
    Y_train = train_df[label]
    X_test  = test_df[features]
    Y_test  = test_df[label]  
    decision_tree = DecisionTreeClassifier()
    decision_tree.fit(X_train, Y_train)
    y_scores = decision_tree.predict_proba(X_test)
    # One hot encode the labels in order to plot them
    y_onehot = pd.get_dummies(Y_test, columns=decision_tree.classes_)
    # Create an empty figure, and iteratively add new lines
    # every time we compute a new class
    fig = go.Figure()
    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=0, y1=1
    )

    for i in range(y_scores.shape[1]):
        y_true = y_onehot.iloc[:, i]
        y_score = y_scores[:, i]
        fpr, tpr, _ = roc_curve(y_true, y_score)
        auc_score = roc_auc_score(y_true, y_score)
        
        if(label == "HpAnswer"):
            if y_onehot.columns[i] == 1 : attrcol = "HP1"
            if y_onehot.columns[i] == 2 : attrcol = "HP2"
            if y_onehot.columns[i] == 3 : attrcol = "HP3"
        elif(label == "Gender"):
            if y_onehot.columns[i] == 0 : attrcol = "F"
            if y_onehot.columns[i] == 1 : attrcol = "M"
        elif(label == "Age"):
            if y_onehot.columns[i] == 1 : attrcol = "20-27"
            if y_onehot.columns[i] == 2 : attrcol = "28-35"
            if y_onehot.columns[i] == 3 : attrcol = "36-45"
        elif(label == "Status"):
            if y_onehot.columns[i] == 0 : attrcol = "S"
            if y_onehot.columns[i] == 1 : attrcol = "M"
        elif(label == "Education"):
            if y_onehot.columns[i] == 1 : attrcol = "G"
            if y_onehot.columns[i] == 2 : attrcol = "U"
            if y_onehot.columns[i] == 3 : attrcol = "H"
            if y_onehot.columns[i] == 4 : attrcol = "O"
        elif(label == "Occupation"):
            if y_onehot.columns[i] == 1 : attrcol = "S"
            if y_onehot.columns[i] == 2 : attrcol = "E"
            if y_onehot.columns[i] == 3 : attrcol = "B"
            if y_onehot.columns[i] == 4 : attrcol = "O"
        
        name = f"{attrcol} (AUC={auc_score:.2f})"
        fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines'))
    if(label == "HpAnswer"):
        labels="Headphone Type"
    else:
        labels= label
    fig.update_layout(
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
        title=f'ROC Curve of {labels}',
        yaxis=dict(scaleanchor="x", scaleratio=1),
        xaxis=dict(constrain='domain'),
        width=500,height=450,
        font=dict(size=12),margin=dict(l=50, r=50, t=100, b=100))
    
    return fig

 
def getConfusionMatrix(features,label):
    X_train = train_df[features]
    Y_train = train_df[label]
    X_test  = test_df[features]
    Y_test  = test_df[label]  
    decision_tree = DecisionTreeClassifier()
    decision_tree.fit(X_train, Y_train)
    Y_pred = decision_tree.predict(X_test)  
    cm = confusion_matrix(Y_test,Y_pred,normalize='true')
    cm1 = np.round(cm,2)
    #print(Y_train.value_counts().index.shape[0])
    #list(map(str,Y_train.value_counts().index.sort_values()))
    ames = str(classification_report(Y_test , Y_pred))
    #print(ames)
    if(label == "HpAnswer"):
        ac = ['HP1','HP2','HP3']
    elif(label == "Gender"):
        ac = ['F','M']
    elif(label == "Age"):
        ac = ['20-27','28-35', '36-45']
    elif(label == "Status"):
        ac = ['S','M']
    elif(label == "Education"):
        ac = ['G','U','H','O']
    elif(label == "Occupation"):
        if(Y_train.value_counts().index.shape[0]==3):
           ac = ['E','B','O']
        else:
           ac = ['S','E','B','O']

    fig2 = px.imshow(cm1,
                    labels=dict(x="Predicted label", y="True label"),
                    color_continuous_scale='blues',
                    aspect="auto",
                    x = ac,
                    y = ac,
                    #title=a,
                    text_auto=True)
    score =accuracy_score(Y_test , Y_pred)
    sensitivity_recall = recall_score(Y_test , Y_pred, average='weighted')
    specificity = recall_score(Y_test , Y_pred, average='weighted', pos_label=0)
    precision = precision_score(Y_test , Y_pred, average='weighted')
    acc=f"Accuracy: {round(score*100,2)}%"
    pres=f"Precision: {round(precision,3)}|"
    tpr = f"TPR: {round(sensitivity_recall,3)}|"
    fpr = f"FPR: {round(1-specificity,3)}|"
    spe = f"Specificity: {round(specificity,3)}"
    ames= ames.replace('\n',"<br>")
    text23=f"avg_ {tpr}{fpr}{spe}"
#     fig2.add_annotation(text=ames+text23, 
#                     align='left',
#                     showarrow=False,
#                     xref='paper',
#                     yref='paper',
#                     x=1.0,
#                     y=2.3,
#                     bordercolor='black',
#                     borderwidth=1)   
    fig2.update_xaxes(side="bottom")
    fig2.update_layout(width=360,height=380, font=dict(size=15),margin=dict(l=0, r=0, t=0, b=0))
    fig2.update_coloraxes(showscale=False)
    
    return fig2
In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt

X_train = train_df[['FactorHps','PriceHps','ActivityHps',"InnovationHps"]]
Y_train = train_df[['Gender']]
X_test  = test_df[['FactorHps','PriceHps','ActivityHps',"InnovationHps"]]
Y_test  = test_df[['Gender']]  
decision_tree.fit(X_train, Y_train)
y_pred_proba = decision_tree.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(Y_test,  y_pred_proba)
plt.plot(fpr,tpr)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
No description has been provided for this image
In [31]:
colname
Out[31]:
['ActivityHps',
 'PriceHps',
 'PlaceHps',
 'FactorHps',
 'InnovationHps',
 'InfoHps_online',
 'InfoHps_Store',
 'InfoHps_PR']
In [32]:
getROCfigure2(colname,'HpAnswer')
In [33]:
getConfusionMatrix(colname,'HpAnswer')
/srv/conda/envs/notebook/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1396: UserWarning:

Note that pos_label (set to 0) is ignored when average != 'binary' (got 'weighted'). You may use labels=[pos_label] to specify a single positive class.

In [34]:
colname=['FactorHps','PriceHps','ActivityHps',"InnovationHps"]
In [35]:
getROCfigure2(colname,'HpAnswer')
In [36]:
getConfusionMatrix(colname,'HpAnswer')
/srv/conda/envs/notebook/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1396: UserWarning:

Note that pos_label (set to 0) is ignored when average != 'binary' (got 'weighted'). You may use labels=[pos_label] to specify a single positive class.

In [37]:
import dalex as dx


from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')

df3 = pd.concat([df]*20, ignore_index=True)

train_df,test_df = train_test_split(df3, test_size=0.3, random_state=42, shuffle=True)

X = train_df[['Gender','Age','Status','Education','Occupation','FactorHps','ActivityHps','PriceHps']]
y = train_df["HealthHps"]

numerical_features = ['Gender','Age','Status','Education','Occupation','FactorHps', 'ActivityHps', 'PriceHps']
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)

# categorical_features = ['gender', 'class', 'embarked']
# categorical_transformer = Pipeline(
#     steps=[
#         ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
#         ('onehot', OneHotEncoder(handle_unknown='ignore'))
#     ]
# )

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
       # ('cat', categorical_transformer, categorical_features)
    ]
)

classifier = MLPClassifier(hidden_layer_sizes=(150,100,50), max_iter=500, random_state=0)

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', classifier)])
In [38]:
clf.fit(X, y)
Out[38]:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Gender', 'Age', 'Status',
                                                   'Education', 'Occupation',
                                                   'FactorHps', 'ActivityHps',
                                                   'PriceHps'])])),
                ('classifier',
                 MLPClassifier(hidden_layer_sizes=(150, 100, 50), max_iter=500,
                               random_state=0))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Gender', 'Age', 'Status',
                                                   'Education', 'Occupation',
                                                   'FactorHps', 'ActivityHps',
                                                   'PriceHps'])])),
                ('classifier',
                 MLPClassifier(hidden_layer_sizes=(150, 100, 50), max_iter=500,
                               random_state=0))])
ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 ['Gender', 'Age', 'Status', 'Education',
                                  'Occupation', 'FactorHps', 'ActivityHps',
                                  'PriceHps'])])
['Gender', 'Age', 'Status', 'Education', 'Occupation', 'FactorHps', 'ActivityHps', 'PriceHps']
SimpleImputer(strategy='median')
StandardScaler()
MLPClassifier(hidden_layer_sizes=(150, 100, 50), max_iter=500, random_state=0)
In [39]:
exp = dx.Explainer(clf, X, y)
Preparation of a new explainer is initiated

  -> data              : 1008 rows 8 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 1008 values
  -> model_class       : sklearn.neural_network._multilayer_perceptron.MLPClassifier (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x7f78300cec20> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 4.02e-08, mean = 0.614, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.323, mean = 0.00474, max = 0.677
  -> model_info        : package sklearn

A new explainer has been created!
In [40]:
# ['Gender','Age','Status','Education','Occupation','HpAnswer','NumberHps', 'ActivityHps', 'TimeHps',\
#                       'PriceHps','PlaceHps','FactorHps','HealthHps','InnovationHps',"PB_nosound","PB_disconnect","PB_badsound",\
#                       "PB_unfit","PB_oneear","PB_toosd","PB_audiocut","PB_battery","PB_unplug","InfoHps_online",\
#                       "InfoHps_Social","InfoHps_Google","InfoHps_Store","InfoHps_PR","InfoHps_Ads"]

me = pd.DataFrame({'Gender': [0],
                   'Age': [2],
                   'Status': [1],
                   'Education':[2],
                   'Occupation':[2],
                   'FactorHps':[1],
                   'ActivityHps':[2],
                   'PriceHps':[1]},
                    index = ['Me'])
In [41]:
exp.predict(me)
Out[41]:
array([6.92954416e-05])
In [42]:
bd_me = exp.predict_parts(me, type='break_down', label=me.index[0])
# bd_interactions_me = exp.predict_parts(me, type='break_down_interactions', label="me+")

# sh_me = exp.predict_parts(me, type='shap', B = 10, label=me.index[0])
In [43]:
bd_me.result
Out[43]:
variable_name variable_value variable cumulative contribution sign position label
0 intercept intercept 0.614307 0.614307 1.0 9 Me
1 PriceHps 1.0 PriceHps = 1.0 0.743928 0.129621 1.0 8 Me
2 Education 2.0 Education = 2.0 0.769384 0.025456 1.0 7 Me
3 Occupation 2.0 Occupation = 2.0 0.885900 0.116516 1.0 6 Me
4 Gender 0.0 Gender = 0.0 0.834517 -0.051383 -1.0 5 Me
5 Age 2.0 Age = 2.0 0.840359 0.005843 1.0 4 Me
6 FactorHps 1.0 FactorHps = 1.0 0.843857 0.003498 1.0 3 Me
7 ActivityHps 2.0 ActivityHps = 2.0 0.768682 -0.075175 -1.0 2 Me
8 Status 1.0 Status = 1.0 0.000069 -0.768613 -1.0 1 Me
9 prediction 0.000069 0.000069 1.0 0 Me
In [44]:
bd_me.plot()
In [45]:
import dalex as dx

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

X = train_df[['Gender','Age','Status','Education','Occupation','FactorHps', 'ActivityHps', 'PriceHps', 'InnovationHps']]
y = train_df["HpAnswer"]

model = RandomForestClassifier(n_estimators=100)
model.fit(X, y) 
model.predict_proba(X).shape
Out[45]:
(1008, 3)
In [46]:
# custom (binary) predict function
pf_0 = lambda m, d: m.predict_proba(d)[:, 0]

# custom (binary) target values
y_0 = y == 1

# explainer
exp_0 = dx.Explainer(model, X, y_0, predict_function=pf_0, label="RFClassifier: class 0")
Preparation of a new explainer is initiated

  -> data              : 1008 rows 9 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 1008 values
  -> model_class       : sklearn.ensemble._forest.RandomForestClassifier (default)
  -> label             : RFClassifier: class 0
  -> predict function  : <function <lambda> at 0x7f782abbf760> will be used
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 0.0, mean = 0.161, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = 0.0, mean = 0.0, max = 0.0
  -> model_info        : package sklearn

A new explainer has been created!
In [47]:
exp_0.model_parts()
Out[47]:
variable dropout_loss label
0 Age 0.000000 RFClassifier: class 0
1 Gender 0.000000 RFClassifier: class 0
2 Occupation 0.000000 RFClassifier: class 0
3 Status 0.000000 RFClassifier: class 0
4 _full_model_ 0.000000 RFClassifier: class 0
5 InnovationHps 0.003586 RFClassifier: class 0
6 ActivityHps 0.012423 RFClassifier: class 0
7 FactorHps 0.044385 RFClassifier: class 0
8 Education 0.069562 RFClassifier: class 0
9 PriceHps 0.078651 RFClassifier: class 0
10 _baseline_ 0.493904 RFClassifier: class 0
In [48]:
exp_list = []

for i in range(len(np.unique(y))):
    # add i parameter to `predict_function` just to do it in a loop
    pf = lambda m, d, i=i: m.predict_proba(d)[:, i]
    e = dx.Explainer(
        model, X, 
        y == i+1, 
        predict_function=pf,
        label=f'RFClassifier: class {i+1}',
        verbose=False
    )
    exp_list += [e]

exp_list
Out[48]:
[<dalex._explainer.object.Explainer at 0x7f782adf48b0>,
 <dalex._explainer.object.Explainer at 0x7f782adf6560>,
 <dalex._explainer.object.Explainer at 0x7f782abf5b10>]
In [49]:
m_profile_list = [e.model_profile() for e in exp_list]
Calculating ceteris paribus: 100%|██████████| 9/9 [00:01<00:00,  5.11it/s]
Calculating ceteris paribus: 100%|██████████| 9/9 [00:01<00:00,  5.38it/s]
Calculating ceteris paribus: 100%|██████████| 9/9 [00:01<00:00,  5.52it/s]
In [50]:
m_profile_list[0].plot(m_profile_list[1:])
In [51]:
m_parts_list = [e.model_parts() for e in exp_list]
m_parts_list[0].plot(m_parts_list[1:])
In [52]:
# choose a data point to explain
# observation = X.iloc[[0]]
# observation
observation = pd.DataFrame({'Gender': [1],
                   'Age': [2],
                   'Status': [0],
                   'Education':[2],
                   'Occupation':[2],
                   'FactorHps':[3],
                   'ActivityHps':[2],
                   'InnovationHps':[2],
                   'PriceHps':[2]},
                    index = ['Me'])
#p_parts_list = [e.predict_parts(observation) for e in exp_list]
#p_parts_list[0].plot(p_parts_list[1:], min_max=[-0.1, 1.1])
In [53]:
p_parts_list = [e.predict_parts(observation) for e in exp_list]
#p_parts_list = [e.predict_parts(observation).result for e in exp_list]
p_parts_list[1]
Out[53]:
variable_name variable_value variable cumulative contribution sign position label
0 intercept intercept 0.590278 0.590278 1.0 8 RFClassifier: class 2
1 InnovationHps:PriceHps 2.0:2.0 InnovationHps:PriceHps = 2.0:2.0 0.595764 0.005486 1.0 7 RFClassifier: class 2
2 ActivityHps 2.0 ActivityHps = 2.0 0.527044 -0.068720 -1.0 6 RFClassifier: class 2
3 Education 2.0 Education = 2.0 0.467361 -0.059683 -1.0 5 RFClassifier: class 2
4 FactorHps 3.0 FactorHps = 3.0 0.545665 0.078304 1.0 4 RFClassifier: class 2
5 Age:Gender 2.0:1.0 Age:Gender = 2.0:1.0 0.526538 -0.019127 -1.0 3 RFClassifier: class 2
6 Occupation 2.0 Occupation = 2.0 0.559196 0.032659 1.0 2 RFClassifier: class 2
7 Status 0.0 Status = 0.0 0.580000 0.020804 1.0 1 RFClassifier: class 2
8 prediction 0.580000 0.580000 1.0 0 RFClassifier: class 2
In [54]:
p_parts_list[0].plot(p_parts_list[1:], min_max=[-0.1, 1.1])
In [55]:
MLdecision_tree(observation)
acc_random_forest_hp: 100.0
acc_random_forest_gender: 100.0
acc_random_forest_age: 100.0
acc_random_forest_status: 100.0
acc_random_forest_edu: 100.0
acc_random_forest_occ: 100.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        58
           2       1.00      1.00      1.00       245
           3       1.00      1.00      1.00       129

    accuracy                           1.00       432
   macro avg       1.00      1.00      1.00       432
weighted avg       1.00      1.00      1.00       432

Out[55]:
('HP2',
 'male',
 '28-35',
 'single',
 'undergraduate',
 'employee',
 2,
 1,
 2,
 0,
 2,
 2,
 array([[0.18, 0.61, 0.21]]))
In [56]:
pd.concat((p_parts_list[0].result,p_parts_list[1].result,p_parts_list[2].result))
Out[56]:
variable_name variable_value variable cumulative contribution sign position label
0 intercept intercept 0.160714 0.160714 1.0 8 RFClassifier: class 1
1 ActivityHps 2.0 ActivityHps = 2.0 0.243115 0.082401 1.0 7 RFClassifier: class 1
2 FactorHps 3.0 FactorHps = 3.0 0.196964 -0.046151 -1.0 6 RFClassifier: class 1
3 PriceHps:Gender 2.0:1.0 PriceHps:Gender = 2.0:1.0 0.141111 -0.055853 -1.0 5 RFClassifier: class 1
4 InnovationHps:Education 2.0:2.0 InnovationHps:Education = 2.0:2.0 0.142282 0.001171 1.0 4 RFClassifier: class 1
5 Status 0.0 Status = 0.0 0.154812 0.012530 1.0 3 RFClassifier: class 1
6 Age 2.0 Age = 2.0 0.186607 0.031796 1.0 2 RFClassifier: class 1
7 Occupation 2.0 Occupation = 2.0 0.180000 -0.006607 -1.0 1 RFClassifier: class 1
8 prediction 0.180000 0.180000 1.0 0 RFClassifier: class 1
0 intercept intercept 0.590278 0.590278 1.0 8 RFClassifier: class 2
1 InnovationHps:PriceHps 2.0:2.0 InnovationHps:PriceHps = 2.0:2.0 0.595764 0.005486 1.0 7 RFClassifier: class 2
2 ActivityHps 2.0 ActivityHps = 2.0 0.527044 -0.068720 -1.0 6 RFClassifier: class 2
3 Education 2.0 Education = 2.0 0.467361 -0.059683 -1.0 5 RFClassifier: class 2
4 FactorHps 3.0 FactorHps = 3.0 0.545665 0.078304 1.0 4 RFClassifier: class 2
5 Age:Gender 2.0:1.0 Age:Gender = 2.0:1.0 0.526538 -0.019127 -1.0 3 RFClassifier: class 2
6 Occupation 2.0 Occupation = 2.0 0.559196 0.032659 1.0 2 RFClassifier: class 2
7 Status 0.0 Status = 0.0 0.580000 0.020804 1.0 1 RFClassifier: class 2
8 prediction 0.580000 0.580000 1.0 0 RFClassifier: class 2
0 intercept intercept 0.249008 0.249008 1.0 7 RFClassifier: class 3
1 Education 2.0 Education = 2.0 0.319177 0.070169 1.0 6 RFClassifier: class 3
2 InnovationHps:PriceHps 2.0:2.0 InnovationHps:PriceHps = 2.0:2.0 0.320665 0.001488 1.0 5 RFClassifier: class 3
3 FactorHps:Gender 3.0:1.0 FactorHps:Gender = 3.0:1.0 0.326835 0.006171 1.0 4 RFClassifier: class 3
4 Age 2.0 Age = 2.0 0.366687 0.039851 1.0 3 RFClassifier: class 3
5 Occupation 2.0 Occupation = 2.0 0.351230 -0.015456 -1.0 2 RFClassifier: class 3
6 ActivityHps:Status 2.0:0.0 ActivityHps:Status = 2.0:0.0 0.240000 -0.111230 -1.0 1 RFClassifier: class 3
7 prediction 0.240000 0.240000 1.0 0 RFClassifier: class 3
In [57]:
p_parts_list[1].result
Out[57]:
variable_name variable_value variable cumulative contribution sign position label
0 intercept intercept 0.590278 0.590278 1.0 8 RFClassifier: class 2
1 InnovationHps:PriceHps 2.0:2.0 InnovationHps:PriceHps = 2.0:2.0 0.595764 0.005486 1.0 7 RFClassifier: class 2
2 ActivityHps 2.0 ActivityHps = 2.0 0.527044 -0.068720 -1.0 6 RFClassifier: class 2
3 Education 2.0 Education = 2.0 0.467361 -0.059683 -1.0 5 RFClassifier: class 2
4 FactorHps 3.0 FactorHps = 3.0 0.545665 0.078304 1.0 4 RFClassifier: class 2
5 Age:Gender 2.0:1.0 Age:Gender = 2.0:1.0 0.526538 -0.019127 -1.0 3 RFClassifier: class 2
6 Occupation 2.0 Occupation = 2.0 0.559196 0.032659 1.0 2 RFClassifier: class 2
7 Status 0.0 Status = 0.0 0.580000 0.020804 1.0 1 RFClassifier: class 2
8 prediction 0.580000 0.580000 1.0 0 RFClassifier: class 2
In [58]:
adata = p_parts_list[2].result

adata['contribution'] = round(adata['contribution'],3)
adata
Out[58]:
variable_name variable_value variable cumulative contribution sign position label
0 intercept intercept 0.249008 0.249 1.0 7 RFClassifier: class 3
1 Education 2.0 Education = 2.0 0.319177 0.070 1.0 6 RFClassifier: class 3
2 InnovationHps:PriceHps 2.0:2.0 InnovationHps:PriceHps = 2.0:2.0 0.320665 0.001 1.0 5 RFClassifier: class 3
3 FactorHps:Gender 3.0:1.0 FactorHps:Gender = 3.0:1.0 0.326835 0.006 1.0 4 RFClassifier: class 3
4 Age 2.0 Age = 2.0 0.366687 0.040 1.0 3 RFClassifier: class 3
5 Occupation 2.0 Occupation = 2.0 0.351230 -0.015 -1.0 2 RFClassifier: class 3
6 ActivityHps:Status 2.0:0.0 ActivityHps:Status = 2.0:0.0 0.240000 -0.111 -1.0 1 RFClassifier: class 3
7 prediction 0.240000 0.240 1.0 0 RFClassifier: class 3
In [59]:
import plotly.express as px

def get_explaination(observation):
    X = train_df[observation.columns.values]
    y = train_df["HpAnswer"]
    model = RandomForestClassifier(n_estimators=100)
    model.fit(X, y) 
    exp_list = []

    for i in range(len(np.unique(y))):
        # add i parameter to `predict_function` just to do it in a loop
        pf = lambda m, d, i=i: m.predict_proba(d)[:, i]
        e = dx.Explainer(
            model, X, 
            y == i, 
            predict_function=pf,
            label=f'{i+1}',
            verbose=False
        )
        exp_list += [e]
    
    p_parts_list = [e.predict_parts(observation) for e in exp_list]
    adata = pd.concat((p_parts_list[0].result,p_parts_list[1].result,p_parts_list[2].result))    
    adata['contribution'] = round(adata['contribution'],3)
    pt = adata.loc[adata['variable']=="prediction"]
    pmax = pt['contribution'].max()
    pt = pt.loc[pt['contribution']==pmax]
    htype = pt['label'].iloc[0]

    adata = adata.loc[adata['label']==htype]
    #print(adata)
    fig = go.Figure()
    bcolors = []
    adata['variable_name'].replace("",'prediction', inplace=True)
    for i in range(len(adata['contribution'])):
        if (adata.iloc[i]['contribution'] < 0):
               bcolors.append("red")
        elif(adata.iloc[i]['contribution'] > 0 and adata.iloc[i]['variable_name']!="prediction" \
             and adata.iloc[i]['variable_name']!="intercept"):
               bcolors.append("green")
        elif(adata.iloc[i]['variable_name']=="prediction"):
               bcolors.append("blue")
        elif(adata.iloc[i]['variable_name']=="intercept"):
               bcolors.append("white")
    fig.add_trace(go.Bar(
        y= adata['variable_name'],
        x=adata['contribution'],
        orientation='h',
        text = adata['contribution'],
        marker=dict(
            color=bcolors,
            #line=dict(color='rgba(246, 78, 139, 1.0)', width=3)
        )
    ))
    fig.update_layout(
        title= f'<b>Explaination: product selection HP{htype}</b>',
        margin=dict(l=0, r=0, t=35, b=10),
        #height=300
    )

    return fig

def get_exresult(observation):
    
    X = train_df[observation.columns.values]
    y = train_df["HpAnswer"]
    model = RandomForestClassifier(n_estimators=100)
    model.fit(X, y) 
    exp_list = []

    for i in range(len(np.unique(y))):
        # add i parameter to `predict_function` just to do it in a loop
        pf = lambda m, d, i=i: m.predict_proba(d)[:, i]
        e = dx.Explainer(
            model, X, 
            y == i, 
            predict_function=pf,
            label=f'{i+1}',
            verbose=False
        )
        exp_list += [e]
    
    p_parts_list = [e.predict_parts(observation) for e in exp_list]
    adata2 = pd.concat((p_parts_list[0].result,p_parts_list[1].result,p_parts_list[2].result))    
    adata = pd.concat((p_parts_list[0].result,p_parts_list[1].result,p_parts_list[2].result))    
    adata['contribution'] = round(adata['contribution'],3)
    pt = adata.loc[adata['variable']=="prediction"]
    pmax = pt['contribution'].max()
    pt = pt.loc[pt['contribution']==pmax]
    htype = pt['label'].iloc[0]
    adata = adata.loc[adata['label']==htype]

    return htype,adata2
In [60]:
observation2 = pd.DataFrame({'Gender': [1],
                   'Age': [2],
                   'Status': [0],
                   'Education':[2],
                   'Occupation':[2],
                   'FactorHps':[5],
                   'ActivityHps':[2],
                   'InnovationHps':[2],
                   'PriceHps':[2]})
In [61]:
get_explaination(observation)
In [62]:
#get_exresult(observation)
get_explaination(observation)
In [63]:
get_exresult(observation)
Out[63]:
('2',
              variable_name variable_value                           variable  \
 0                intercept                                         intercept   
 1              ActivityHps            2.0                  ActivityHps = 2.0   
 2                FactorHps            3.0                    FactorHps = 3.0   
 3          PriceHps:Gender        2.0:1.0          PriceHps:Gender = 2.0:1.0   
 4                      Age            2.0                          Age = 2.0   
 5                   Status            0.0                       Status = 0.0   
 6  InnovationHps:Education        2.0:2.0  InnovationHps:Education = 2.0:2.0   
 7               Occupation            2.0                   Occupation = 2.0   
 8                                                                 prediction   
 0                intercept                                         intercept   
 1       PriceHps:FactorHps        2.0:3.0       PriceHps:FactorHps = 2.0:3.0   
 2              ActivityHps            2.0                  ActivityHps = 2.0   
 3            InnovationHps            2.0                InnovationHps = 2.0   
 4                Education            2.0                    Education = 2.0   
 5               Age:Gender        2.0:1.0               Age:Gender = 2.0:1.0   
 6                   Status            0.0                       Status = 0.0   
 7               Occupation            2.0                   Occupation = 2.0   
 8                                                                 prediction   
 0                intercept                                         intercept   
 1                Education            2.0                    Education = 2.0   
 2       PriceHps:FactorHps        2.0:3.0       PriceHps:FactorHps = 2.0:3.0   
 3                      Age            2.0                          Age = 2.0   
 4            InnovationHps            2.0                InnovationHps = 2.0   
 5        Occupation:Gender        2.0:1.0        Occupation:Gender = 2.0:1.0   
 6       ActivityHps:Status        2.0:0.0       ActivityHps:Status = 2.0:0.0   
 7                                                                 prediction   
 
    cumulative  contribution  sign  position label  
 0    0.160714      0.160714   1.0         8     1  
 1    0.238562      0.077847   1.0         7     1  
 2    0.201181     -0.037381  -1.0         6     1  
 3    0.115853     -0.085327  -1.0         5     1  
 4    0.122550      0.006696   1.0         4     1  
 5    0.124583      0.002034   1.0         3     1  
 6    0.155813      0.031230   1.0         2     1  
 7    0.160000      0.004187   1.0         1     1  
 8    0.160000      0.160000   1.0         0     1  
 0    0.590278      0.590278   1.0         8     2  
 1    0.717252      0.126974   1.0         7     2  
 2    0.659514     -0.057738  -1.0         6     2  
 3    0.624613     -0.034901  -1.0         5     2  
 4    0.556944     -0.067669  -1.0         4     2  
 5    0.581706      0.024762   1.0         3     2  
 6    0.599524      0.017817   1.0         2     2  
 7    0.620000      0.020476   1.0         1     2  
 8    0.620000      0.620000   1.0         0     2  
 0    0.249008      0.249008   1.0         7     3  
 1    0.310823      0.061815   1.0         6     3  
 2    0.274613     -0.036210  -1.0         5     3  
 3    0.235962     -0.038651  -1.0         4     3  
 4    0.242183      0.006220   1.0         3     3  
 5    0.315853      0.073671   1.0         2     3  
 6    0.220000     -0.095853  -1.0         1     3  
 7    0.220000      0.220000   1.0         0     3  )
In [ ]:
 
In [ ]:
 
In [ ]: